
* Start from an empty session and disable the -more- pause so the script runs unattended.
clear all
set more off

* Project root. This script reads from $dir/rawdata and $dir/cleandata and
* writes intermediate files to $dir/temp and final output to $dir/cleandata.
global dir /Volumes/Zihao_SSD2/PatentsView

*** Part 1: cleans patent information (CPC classes, assignee, assignee location)
*** and merges it onto the patent-level file.
*** Zihao Li. 06/2024

*** Clean CPC
* Input : $dir/rawdata/g_cpc_current.tsv (one row per patent_id x cpc_sequence)
* Output: $dir/temp/g_cpc_patentlevel.dta (one row per patent_id) with
*         - main_cpc_section / main_cpc_class / main_cpc_subclass: the codes of
*           the row with cpc_sequence == 0 (empty if the patent has no such row)
*         - cpc_list: all CPC subclasses of the patent joined by "; " in
*           cpc_sequence order
import delimited "$dir/rawdata/g_cpc_current.tsv", clear

* Highest cpc_sequence present in the data; used as the loop bound below
* instead of a hard-coded constant so the code keeps working when the raw
* file is refreshed.
summarize cpc_sequence, meanonly
local max_seq = r(max)

* Copy the codes of the main (sequence 0) classification ...
gen main_cpc_section = ""
gen main_cpc_class = ""
gen main_cpc_subclass = ""
replace main_cpc_section = cpc_section if cpc_sequence == 0
replace main_cpc_class = cpc_class if cpc_sequence == 0
replace main_cpc_subclass = cpc_subclass if cpc_sequence == 0

* ... and carry them forward to every other row of the same patent.
* (cpc_sequence) makes the within-patent ordering explicit so that the
* sequence-0 row is guaranteed to come first.
bysort patent_id (cpc_sequence): replace main_cpc_section = main_cpc_section[_n-1] if _n > 1
bysort patent_id (cpc_sequence): replace main_cpc_class = main_cpc_class[_n-1] if _n > 1
bysort patent_id (cpc_sequence): replace main_cpc_subclass = main_cpc_subclass[_n-1] if _n > 1

* One row per patent: cpc_subclass0, cpc_subclass1, ... hold the subclasses.
keep patent_id cpc_sequence cpc_subclass main_cpc_section main_cpc_class main_cpc_subclass
reshape wide cpc_subclass, i(patent_id) j(cpc_sequence)

* Concatenate the wide columns into a single "; "-separated list.
gen cpc_list = cpc_subclass0
forvalues y = 1/`max_seq' {
	* reshape only creates columns for sequence values that occur in the data;
	* -exact- prevents cpc_subclass1 from matching e.g. cpc_subclass10.
	capture confirm variable cpc_subclass`y', exact
	if _rc continue
	replace cpc_list = cpc_list + "; " + cpc_subclass`y' if !missing(cpc_subclass`y')
	drop cpc_subclass`y'
}
format %60s cpc_list
drop cpc_subclass0

* Store patent_id as a string so it can be merged with the other patent files.
tostring patent_id, replace
save "$dir/temp/g_cpc_patentlevel.dta", replace


*** Clean Assignee
* Builds $dir/temp/g_assignee_patentlevel.dta: the first-listed assignee
* (assignee_sequence == 0) of each patent, with its location attached.
import delimited $dir/rawdata/g_assignee_disambiguated.tsv, clear bindquote(strict)

* Only the first-listed assignee is kept (rows with a missing sequence are dropped too).
keep if assignee_sequence == 0
drop assignee_sequence v5 disambig_assignee_individual_nam

rename disambig_assignee_organization assignee_organization
rename location_id assignee_location_id
format %30s patent_id assignee_id assignee_organization assignee_location_id
save $dir/temp/g_assignee_patentlevel.dta, replace

* Append Assignee Location
* The location file is the master (one row per location); the assignee file
* saved above is merged onto it.
import delimited $dir/rawdata/g_location_disambiguated.tsv, clear
rename location_id     assignee_location_id
rename disambig_city    assignee_city
rename disambig_state   assignee_state
rename disambig_country assignee_country
rename latitude         assignee_latitude
rename longitude        assignee_longitude
rename county           assignee_county
rename state_fips       assignee_state_fips
rename county_fips      assignee_county_fips

* keep(match using): discard locations that no assignee refers to, but keep
* assignees whose location is not found in the location file.
merge 1:m assignee_location_id using $dir/temp/g_assignee_patentlevel.dta, keep(match using) nogenerate
order patent_id assignee_id assignee_organization assignee_type assignee_country assignee_state assignee_city
save $dir/temp/g_assignee_patentlevel.dta, replace


*** This part merges g_patent.tsv with cpc and assignee info
* Input : $dir/rawdata/g_patent.tsv, $dir/temp/g_cpc_patentlevel.dta,
*         $dir/temp/g_assignee_patentlevel.dta
* Output: $dir/temp/patid_year.dta     (patent_id -- patent_year lookup)
*         $dir/temp/g_patent_clean.dta (one row per patent in g_patent.tsv)
import delimited "$dir/rawdata/g_patent.tsv", clear bindquote(strict)
drop patent_abstract

* Store patent_id as str20; the assert guards against truncation.
gen str20 patent_id_new = patent_id
assert patent_id_new == patent_id
drop patent_id
rename patent_id_new patent_id

* Create date variables
* patent_date is split on "-" into year / month / day.
* Note: patent_month is left as a string, as before; only year and day are numeric.
split patent_date, parse("-")
rename patent_date1 patent_year
rename patent_date2 patent_month
rename patent_date3 patent_day
destring patent_year, replace
drop if patent_year == .
destring patent_day, replace

*** Export the patent_id--year correspondence.
preserve
keep patent_id patent_year
duplicates drop patent_id, force
save "$dir/temp/patid_year.dta", replace
restore

* Left-join the CPC information. keep(master match) prevents patents that
* exist only in the CPC file (including those just dropped for a missing
* year) from being re-added as rows with empty patent fields.
merge 1:1 patent_id using "$dir/temp/g_cpc_patentlevel.dta", keep(master match) nogenerate
sort patent_year patent_id
format %20s patent_type

* Left-join the assignee information, for the same reason as above.
merge 1:1 patent_id using "$dir/temp/g_assignee_patentlevel.dta", keep(master match) nogenerate
order patent_id patent_date patent_year assignee_organization assignee_type main_cpc_section main_cpc_class main_cpc_subclass assignee_country assignee_state assignee_city
sort patent_year patent_id
format %30s assignee_city
format %10s assignee_country assignee_state

save "$dir/temp/g_patent_clean.dta", replace





*** =============================================================================================================
*** Part 2: attaches to each patent
***   1. the number of inventors (total / male / female)
***   2. gender, race, birth year and experience of the lead inventor
***   3. birth year and experience of the oldest and youngest inventors
***   4. average birth year and experience of the inventor team
*** and exports the result to $dir/cleandata/g_patent_clean.csv
*** =============================================================================================================
global dir /Volumes/Zihao_SSD2/PatentsView

use $dir/cleandata/g_inventor_gender_race_age.dta, clear

* Patent-level variables only
keep patent_id ///
	num_inventors num_inventors_m num_inventors_f ///
	lead_gender lead_gender_ind lead_gender_prob lead_gender_count ///
	lead_gender_09_0 lead_gender_09_50 lead_gender_09_100 lead_gender_io_09_100 ///
	lead_gender_08_50 lead_gender_08_100 lead_gender_io_08_100 ///
	lead_gender_io_07_100 lead_gender_io_06_100 lead_gender_io_05_100 ///
	lead_race lead_race90 lead_race80 lead_race70 ///
	lead_experience oldest_experience youngest_experience avg_experience ///
	lead_birthyear oldest_birthyear youngest_birthyear avg_birthyear

* One row per patent (the first row of each patent_id is kept)
duplicates drop patent_id, force

* Keep only patents found in both files // 8,256,143
merge 1:1 patent_id using $dir/temp/g_patent_clean.dta, keep(match) nogenerate
drop patent_title filename

* Generate age variables of inventors: age = grant year - birth year
foreach who in lead oldest youngest avg {
	gen `who'_age = patent_year - `who'_birthyear
}

order patent_id patent_date patent_year ///
	assignee_organization assignee_country assignee_type ///
	lead_gender_09_100 lead_gender_ind lead_gender_count lead_gender_prob lead_gender_io_09_100 ///
	lead_race80 ///
	lead_experience oldest_experience youngest_experience ///
	lead_age oldest_age youngest_age avg_age ///
	main_cpc_section main_cpc_class main_cpc_subclass

* Replace line feeds inside organization names with a space before exporting to CSV
replace assignee_organization = subinstr(assignee_organization, char(10), " ", .)

export delimited using $dir/cleandata/g_patent_clean.csv, replace


*** Centrality and number of coauthors of lead inventor
* Output: $dir/temp/leadinventor_info.dta with, for each patent granted in
* 1981-2015 that has a purely numeric patent_id, the lead inventor's
* deg_centrality, log_deg_centrality and cumulative_coauthors.

* Step 1: convert the centrality CSV to .dta
import delimited "$dir/cleandata/centrality_sum_1981_2015.csv", clear
* Drop rows whose inventor_id is longer than 50 characters, then store it as str50
drop if length(inventor_id) > 50
recast str50 inventor_id
gen log_deg_centrality = log(deg_centrality)
save "$dir/temp/centrality_sum_1981_2015.dta", replace

* Step 2: lead inventor of each patent
use "$dir/cleandata/g_inventor_gender_race_age.dta", clear
keep patent_id patent_year inventor_id inventor_sequence
rename patent_year year

* Only patent ids consisting solely of digits can be converted to numeric
drop if regexm(patent_id, "[^0-9]")
destring patent_id, replace

* Restrict to lead inventors (inventor_sequence == 0) and to 1981-2015
* before merging, so the merges run on the rows that are actually kept.
keep if inventor_sequence == 0 & inrange(year, 1981, 2015)

* Step 3: attach inventor-year information as left joins. keep(master match)
* discards inventor-years that have no lead-inventor patent instead of
* relying on a later filter to remove them.
* NOTE(review): inventor_coauthor.dta is not created in this part of the
* script; it is expected to exist in $dir/temp already.
merge m:1 inventor_id year using "$dir/temp/inventor_coauthor.dta", keep(master match) nogenerate
merge m:1 inventor_id year using "$dir/temp/centrality_sum_1981_2015.dta", keep(master match) nogenerate // 9,094,345

sort patent_id
keep patent_id deg_centrality log_deg_centrality cumulative_coauthors

save "$dir/temp/leadinventor_info.dta", replace

